#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
%matplotlib inline
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# Load the songs dataset from songs_normalize.csv into a pandas DataFrame
# and preview its first five rows.
songs = pd.read_csv('songs_normalize.csv')
songs.head()
| artist | song | duration_ms | explicit | year | popularity | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Britney Spears | Oops!...I Did It Again | 211160 | False | 2000 | 77 | 0.751 | 0.834 | 1 | -5.444 | 0 | 0.0437 | 0.3000 | 0.000018 | 0.3550 | 0.894 | 95.053 | pop |
| 1 | blink-182 | All The Small Things | 167066 | False | 1999 | 79 | 0.434 | 0.897 | 0 | -4.918 | 1 | 0.0488 | 0.0103 | 0.000000 | 0.6120 | 0.684 | 148.726 | rock, pop |
| 2 | Faith Hill | Breathe | 250546 | False | 1999 | 66 | 0.529 | 0.496 | 7 | -9.007 | 1 | 0.0290 | 0.1730 | 0.000000 | 0.2510 | 0.278 | 136.859 | pop, country |
| 3 | Bon Jovi | It's My Life | 224493 | False | 2000 | 78 | 0.551 | 0.913 | 0 | -4.063 | 0 | 0.0466 | 0.0263 | 0.000013 | 0.3470 | 0.544 | 119.992 | rock, metal |
| 4 | *NSYNC | Bye Bye Bye | 200560 | False | 2000 | 65 | 0.614 | 0.928 | 8 | -4.806 | 0 | 0.0516 | 0.0408 | 0.001040 | 0.0845 | 0.879 | 172.656 | pop |
# Print the column names, non-null counts and dtypes of the DataFrame.
songs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 artist 2000 non-null object 1 song 2000 non-null object 2 duration_ms 2000 non-null int64 3 explicit 2000 non-null bool 4 year 2000 non-null int64 5 popularity 2000 non-null int64 6 danceability 2000 non-null float64 7 energy 2000 non-null float64 8 key 2000 non-null int64 9 loudness 2000 non-null float64 10 mode 2000 non-null int64 11 speechiness 2000 non-null float64 12 acousticness 2000 non-null float64 13 instrumentalness 2000 non-null float64 14 liveness 2000 non-null float64 15 valence 2000 non-null float64 16 tempo 2000 non-null float64 17 genre 2000 non-null object dtypes: bool(1), float64(9), int64(5), object(3) memory usage: 267.7+ KB
# Put the columns in alphabetical order, then inspect 20 randomly drawn rows.
alphabetical_columns = sorted(songs.columns)
songs = songs.reindex(columns=alphabetical_columns)
songs.sample(20)
| acousticness | artist | danceability | duration_ms | energy | explicit | genre | instrumentalness | key | liveness | loudness | mode | popularity | song | speechiness | tempo | valence | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 647 | 0.02320 | Girls Aloud | 0.681 | 201590 | 0.908 | False | pop, Dance/Electronic | 0.000000 | 11 | 0.3550 | -3.918 | 0 | 49 | Something Kinda Ooooh | 0.0473 | 131.925 | 0.879 | 2012 |
| 277 | 0.00110 | DJ Sammy | 0.571 | 233600 | 0.953 | False | pop | 0.000606 | 8 | 0.2260 | -5.601 | 1 | 63 | Heaven | 0.0548 | 137.965 | 0.601 | 2002 |
| 1821 | 0.04410 | Clean Bandit | 0.737 | 222653 | 0.636 | False | pop, Dance/Electronic | 0.000067 | 11 | 0.3500 | -4.546 | 0 | 71 | Solo (feat. Demi Lovato) | 0.0437 | 105.005 | 0.565 | 2018 |
| 1045 | 0.02050 | Trey Songz | 0.845 | 242013 | 0.601 | True | hip hop, pop, R&B | 0.000000 | 1 | 0.3850 | -5.283 | 1 | 0 | Bottoms Up (feat. Nicki Minaj) | 0.1610 | 74.008 | 0.329 | 2010 |
| 280 | 0.02640 | Red Hot Chili Peppers | 0.451 | 216933 | 0.970 | False | rock | 0.003550 | 0 | 0.1020 | -4.938 | 1 | 73 | By the Way | 0.1070 | 122.444 | 0.198 | 2002 |
| 949 | 0.04760 | A.R. Rahman | 0.657 | 222400 | 0.941 | False | set() | 0.000000 | 8 | 0.0797 | -3.919 | 0 | 65 | Jai Ho! (You Are My Destiny) | 0.0610 | 136.202 | 0.879 | 2009 |
| 6 | 0.03020 | Eminem | 0.949 | 284200 | 0.661 | True | hip hop | 0.000000 | 5 | 0.0454 | -4.244 | 0 | 86 | The Real Slim Shady | 0.0572 | 104.504 | 0.760 | 2000 |
| 925 | 0.29500 | Jeremih | 0.677 | 226506 | 0.523 | False | hip hop, pop, R&B | 0.000000 | 7 | 0.1500 | -5.603 | 0 | 67 | Birthday Sex | 0.0439 | 60.019 | 0.446 | 2009 |
| 1566 | 0.30400 | R. City | 0.509 | 227480 | 0.671 | False | pop | 0.000000 | 1 | 0.0452 | -5.709 | 1 | 77 | Locked Away (feat. Adam Levine) | 0.0678 | 118.413 | 0.550 | 2015 |
| 1286 | 0.09600 | alt-J | 0.616 | 227080 | 0.656 | False | rock | 0.000879 | 5 | 0.2050 | -7.298 | 1 | 71 | Breezeblocks | 0.0344 | 150.071 | 0.286 | 2012 |
| 954 | 0.24900 | Drake | 0.457 | 357706 | 0.906 | True | hip hop, pop, R&B | 0.000000 | 5 | 0.1820 | -2.278 | 0 | 73 | Forever | 0.3420 | 104.020 | 0.540 | 2009 |
| 589 | 0.01930 | Eminem | 0.520 | 296880 | 0.768 | True | hip hop | 0.000340 | 8 | 0.1040 | -3.489 | 0 | 67 | Like Toy Soldiers | 0.3590 | 79.178 | 0.398 | 2004 |
| 139 | 0.15300 | Christina Milian | 0.872 | 231213 | 0.868 | False | hip hop, pop, R&B | 0.000023 | 10 | 0.8430 | -3.036 | 0 | 60 | AM To PM | 0.1200 | 105.005 | 0.822 | 2001 |
| 1120 | 0.06460 | Wiz Khalifa | 0.684 | 217666 | 0.834 | True | hip hop, pop | 0.000000 | 2 | 0.2710 | -4.524 | 0 | 75 | Black and Yellow | 0.0675 | 164.020 | 0.538 | 2011 |
| 626 | 0.00952 | The Pussycat Dolls | 0.938 | 229360 | 0.735 | False | pop, R&B | 0.000000 | 7 | 0.0998 | -6.382 | 1 | 57 | Beep | 0.0434 | 103.700 | 0.550 | 2005 |
| 1434 | 0.02150 | Calvin Harris | 0.603 | 224506 | 0.861 | False | hip hop, pop, Dance/Electronic | 0.043000 | 4 | 0.1000 | -3.565 | 0 | 21 | Summer | 0.0325 | 127.962 | 0.720 | 2014 |
| 1017 | 0.25500 | DJ Fresh | 0.451 | 192446 | 0.948 | False | pop, Dance/Electronic | 0.000000 | 0 | 0.3920 | -0.740 | 1 | 63 | Gold Dust - Radio Edit | 0.1470 | 176.985 | 0.295 | 2010 |
| 175 | 0.00284 | DB Boulevard | 0.676 | 231166 | 0.715 | False | Dance/Electronic | 0.074600 | 6 | 0.0685 | -6.854 | 1 | 0 | Point Of View - Radio Edit | 0.0287 | 129.006 | 0.275 | 2018 |
| 787 | 0.01080 | T-Pain | 0.451 | 227960 | 0.550 | False | hip hop, pop, R&B | 0.000000 | 1 | 0.0737 | -8.137 | 1 | 2 | Buy U a Drank (Shawty Snappin') (feat. Yung Joc) | 0.2620 | 80.001 | 0.594 | 2007 |
| 340 | 0.03910 | Lumidee | 0.811 | 184906 | 0.657 | False | pop, R&B | 0.712000 | 6 | 0.0798 | -6.197 | 1 | 61 | Never Leave You (Uh Oooh, Uh Oooh) | 0.3620 | 199.958 | 0.777 | 2003 |
# Summary statistics (count, mean, std, min, quartiles, max) per column.
songs.describe()
| acousticness | danceability | duration_ms | energy | instrumentalness | key | liveness | loudness | mode | popularity | speechiness | tempo | valence | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.00000 |
| mean | 0.128955 | 0.667437 | 228748.124500 | 0.720366 | 0.015226 | 5.378000 | 0.181216 | -5.512435 | 0.553500 | 59.872500 | 0.103568 | 120.122558 | 0.551690 | 2009.49400 |
| std | 0.173346 | 0.140416 | 39136.569008 | 0.152745 | 0.087771 | 3.615059 | 0.140669 | 1.933482 | 0.497254 | 21.335577 | 0.096159 | 26.967112 | 0.220864 | 5.85996 |
| min | 0.000019 | 0.129000 | 113000.000000 | 0.054900 | 0.000000 | 0.000000 | 0.021500 | -20.514000 | 0.000000 | 0.000000 | 0.023200 | 60.019000 | 0.038100 | 1998.00000 |
| 25% | 0.014000 | 0.581000 | 203580.000000 | 0.622000 | 0.000000 | 2.000000 | 0.088100 | -6.490250 | 0.000000 | 56.000000 | 0.039600 | 98.985750 | 0.386750 | 2004.00000 |
| 50% | 0.055700 | 0.676000 | 223279.500000 | 0.736000 | 0.000000 | 6.000000 | 0.124000 | -5.285000 | 1.000000 | 65.500000 | 0.059850 | 120.021500 | 0.557500 | 2010.00000 |
| 75% | 0.176250 | 0.764000 | 248133.000000 | 0.839000 | 0.000068 | 8.000000 | 0.241000 | -4.167750 | 1.000000 | 73.000000 | 0.129000 | 134.265500 | 0.730000 | 2015.00000 |
| max | 0.976000 | 0.975000 | 484146.000000 | 0.999000 | 0.985000 | 11.000000 | 0.853000 | -0.276000 | 1.000000 | 89.000000 | 0.576000 | 210.851000 | 0.973000 | 2020.00000 |
# Number of rows that are exact duplicates of an earlier row.
songs.duplicated().sum()
59
# Show the dtype of every column.
songs.dtypes
acousticness float64 artist object danceability float64 duration_ms int64 energy float64 explicit bool genre object instrumentalness float64 key int64 liveness float64 loudness float64 mode int64 popularity int64 song object speechiness float64 tempo float64 valence float64 year int64 dtype: object
# Rename some columns to more descriptive names.
# duration_ms is deliberately left unrenamed: a later cell reads it as
# DURATION_MS. (A mapping key with a trailing space, such as 'duration_ms ',
# would never match a column and would be silently ignored by rename().)
songs.rename(
    columns={'explicit': 'Explicit_Content', 'tempo': 'Song_Pace'},
    inplace=True,
)
# Change all column names to uppercase.
songs.columns = songs.columns.str.upper()
songs.head()
| ACOUSTICNESS | ARTIST | DANCEABILITY | DURATION_MS | ENERGY | EXPLICIT_CONTENT | GENRE | INSTRUMENTALNESS | KEY | LIVENESS | LOUDNESS | MODE | POPULARITY | SONG | SPEECHINESS | SONG_PACE | VALENCE | YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.3000 | Britney Spears | 0.751 | 211160 | 0.834 | False | pop | 0.000018 | 1 | 0.3550 | -5.444 | 0 | 77 | Oops!...I Did It Again | 0.0437 | 95.053 | 0.894 | 2000 |
| 1 | 0.0103 | blink-182 | 0.434 | 167066 | 0.897 | False | rock, pop | 0.000000 | 0 | 0.6120 | -4.918 | 1 | 79 | All The Small Things | 0.0488 | 148.726 | 0.684 | 1999 |
| 2 | 0.1730 | Faith Hill | 0.529 | 250546 | 0.496 | False | pop, country | 0.000000 | 7 | 0.2510 | -9.007 | 1 | 66 | Breathe | 0.0290 | 136.859 | 0.278 | 1999 |
| 3 | 0.0263 | Bon Jovi | 0.551 | 224493 | 0.913 | False | rock, metal | 0.000013 | 0 | 0.3470 | -4.063 | 0 | 78 | It's My Life | 0.0466 | 119.992 | 0.544 | 2000 |
| 4 | 0.0408 | *NSYNC | 0.614 | 200560 | 0.928 | False | pop | 0.001040 | 8 | 0.0845 | -4.806 | 0 | 65 | Bye Bye Bye | 0.0516 | 172.656 | 0.879 | 2000 |
# Rank every song by popularity (highest first) and display the top five.
# top_5_songs holds the whole ranked frame; only the head is shown.
top_5_songs = songs.sort_values(by='POPULARITY', ascending=False)
top_5_songs.head(5)
| ACOUSTICNESS | ARTIST | DANCEABILITY | DURATION_MS | ENERGY | EXPLICIT_CONTENT | GENRE | INSTRUMENTALNESS | KEY | LIVENESS | LOUDNESS | MODE | POPULARITY | SONG | SPEECHINESS | SONG_PACE | VALENCE | YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1322 | 0.04950 | The Neighbourhood | 0.612 | 240400 | 0.807 | False | rock, pop | 0.017700 | 10 | 0.1010 | -2.810 | 1 | 89 | Sweater Weather | 0.0336 | 124.053 | 0.398 | 2013 |
| 1311 | 0.69500 | Tom Odell | 0.445 | 244360 | 0.537 | True | pop | 0.000017 | 4 | 0.0944 | -8.532 | 0 | 88 | Another Love | 0.0400 | 122.769 | 0.131 | 2013 |
| 201 | 0.00286 | Eminem | 0.908 | 290320 | 0.669 | True | hip hop | 0.000000 | 7 | 0.2370 | -2.827 | 1 | 87 | Without Me | 0.0738 | 112.238 | 0.662 | 2002 |
| 1613 | 0.03710 | WILLOW | 0.764 | 196520 | 0.705 | False | pop, R&B, Dance/Electronic | 0.000019 | 3 | 0.0943 | -5.279 | 0 | 86 | Wait a Minute! | 0.0278 | 101.003 | 0.672 | 2015 |
| 6 | 0.03020 | Eminem | 0.949 | 284200 | 0.661 | True | hip hop | 0.000000 | 5 | 0.0454 | -4.244 | 0 | 86 | The Real Slim Shady | 0.0572 | 104.504 | 0.760 | 2000 |
# Distribution of the ENERGY score across all songs.
fig = px.histogram(data_frame=songs, x="ENERGY")
fig.show()
The energy range 0.78–0.799 has the highest frequency.
# Most frequent artists: bar chart of the 15 artists with the most songs.
top_artists = songs.ARTIST.value_counts().iloc[:15].index

plt.figure(figsize=(16, 8))
plt.title("Frequent Artist", fontsize=30)
# Pass the column explicitly as x= so the artists land on the x axis
# regardless of how the installed seaborn treats positional data.
sns.countplot(
    x=songs.ARTIST,
    order=top_artists,
    palette=sns.color_palette("coolwarm", 15),
)
# Label the axes after plotting so the labels set here are the ones shown:
# artists run along x, their song counts along y.
plt.xlabel('ARTIST', fontsize=20)
plt.ylabel('COUNT', fontsize=20)
plt.xticks(size=20, rotation=90)
plt.yticks(size=20)
sns.despine(bottom=True, left=True)
plt.show()
Rihanna, Drake and Eminem are among the most frequent artists.
# Distribution of song durations (DURATION_MS column).
fig = px.histogram(
    data_frame=songs,
    x="DURATION_MS",
    title='Duration Frequency',
)
fig.show()
The most frequent song duration is 210,000–215,000 milliseconds (about 210–215 seconds, i.e. roughly three and a half minutes), with a count of 142 songs.
def songs_to_year(YEAR):
    """Return the decade label for a year.

    The year is floored to the start of its decade and suffixed with 's',
    e.g. 2013 -> '2010s' and 1999 -> '1990s'.

    Args:
        YEAR: An integer year.

    Returns:
        The decade as a string such as '2000s'.
    """
    decade = (YEAR // 10) * 10
    return f'{decade}s'
# Label every song with its decade (e.g. 2013 -> '2010s').
songs['decade'] = songs['YEAR'].apply(songs_to_year)

fig = plt.figure(figsize=(20, 8), facecolor='white')
gs = fig.add_gridspec(1, 1)
ax = fig.add_subplot(gs[0, 0])
# In-plot title, positioned in data coordinates.
ax.text(-0.5, 1500,
        'Songs By Decade',
        fontsize=30,
        fontweight='bold',
        fontfamily='monospace'
        )
ax.grid(color='black', linestyle=':', axis='y', zorder=0, dashes=(1, 5))

# Build one colour per bar: light blue everywhere, green for the decade with
# the most songs. Deriving the palette from the data guarantees the highlight
# lands on an existing bar (a fixed index into a longer palette may not).
decade_counts = songs['decade'].value_counts()
decade_order = sorted(decade_counts.index)  # chronological for 4-digit years
busiest_decade = decade_counts.idxmax()
colormap = [
    "#0ddb22" if decade == busiest_decade else "lightblue"
    for decade in decade_order
]
sns.countplot(data=songs, x='decade', order=decade_order, ax=ax,
              palette=colormap, alpha=1, zorder=2)

# Keep only the bottom spine for a cleaner look.
for direction in ['top', 'right', 'left']:
    ax.spines[direction].set_visible(False)
ax.set_xlabel('Decade', fontsize=14, fontweight='bold')
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', length=0, labelsize=13)
ax.set_ylabel('')
plt.show()
The 2010s had the highest number of songs.
# Scatter of ENERGY against ACOUSTICNESS, with points coloured by ENERGY.
px.scatter(
    data_frame=songs,
    x='ENERGY',
    y='ACOUSTICNESS',
    color="ENERGY",
    color_continuous_scale=px.colors.sequential.Plasma,
    template='plotly_dark',
    title='<b>Energy vs Acousticness',
)
There is a strong negative correlation between energy and acousticness.
# The ten most popular songs, keeping genre and year for colouring and hover.
top_10_music_genres = (
    songs[["POPULARITY", "SONG", "GENRE", "YEAR"]]
    .sort_values("POPULARITY", ascending=False)
    .head(10)
)
top_10_music_genres
# hover_data takes column names of the plotted frame, not a DataFrame slice.
fig = px.scatter(
    top_10_music_genres,
    y='SONG',
    x='POPULARITY',
    hover_data=['GENRE', 'YEAR'],
    color='GENRE',
    title="Top 10 MOST POPULAR MUSIC GENRES FROM 2000-2019",
)
fig.show()
The most popular song belongs to the "rock, pop" genre.
# The ten most popular songs, keeping artist and year for colouring and hover.
top_10_artist = (
    songs[["POPULARITY", "SONG", "ARTIST", "YEAR"]]
    .sort_values("POPULARITY", ascending=False)
    .head(10)
)
top_10_artist
# hover_data takes column names of the plotted frame, not a DataFrame slice.
fig = px.scatter(
    top_10_artist,
    y='SONG',
    x='POPULARITY',
    hover_data=['ARTIST', 'YEAR'],
    color='ARTIST',
    title="Top 10 MOST POPULAR ARTISTS FROM 2000-2019",
)
fig.show()
The top 5 most popular songs on Spotify between 2000 and 2019 are:
# Rank songs from least to most popular, so the head of the frame really is
# the least popular songs (a descending sort would put the most popular first).
least_10_artist = songs.sort_values(by='POPULARITY', ascending=True)
# Trailing semicolon keeps the notebook from echoing this preview.
least_10_artist[['SONG', 'ARTIST']].head(10);
# One bar per artist. With y given, plotly aggregates POPULARITY per artist
# (its documented default aggregation in that case is the sum).
fig = px.histogram(
    data_frame=songs,
    x="ARTIST",
    y="POPULARITY",
    title="Artist Popularity",
    opacity=0.8,
)
fig.show()
Camille Jones has the least popular song
# Scatter of POPULARITY against DANCEABILITY; marker colour follows
# DANCEABILITY and marker size follows POPULARITY.
px.scatter(
    data_frame=songs,
    x="POPULARITY",
    y="DANCEABILITY",
    color="DANCEABILITY",
    size='POPULARITY',
    title='Popularity and Danceability Correlation Plot',
)
As the popularity of the song increases, the danceability score for that song also increases.
# Treemap nested as Singer -> artist -> genre -> song, with tile area
# driven by POPULARITY.
treemap_levels = [px.Constant('Singer'), 'ARTIST', 'GENRE', 'SONG']
fig = px.treemap(
    songs,
    path=treemap_levels,
    values='POPULARITY',
    title='TreeMap Of Artists Playlist',
)
fig.update_traces(root_color='lightgreen')
fig.update_layout(title_x=0.5)
The artist with the most songs is Rihanna and the lowest is Camille Jones
# Number of songs per release year. groupby() returns the groups ordered by
# YEAR, so no further sorting is needed before plotting.
songs_per_year = songs.groupby('YEAR', as_index=False)['SONG'].count()
fig = px.area(
    songs_per_year,
    x='YEAR',
    y='SONG',
    markers=True,
    labels={'SONG': 'Total Songs'},
    color_discrete_sequence=['green'],
    title='Year by Year Song Collection',
)
fig.update_layout(hovermode='x', title_x=0.5)
The year with the highest song collection is 2012.
def songs_to_year(YEAR):
    """Return the decade label for a year.

    The year is floored to the start of its decade and suffixed with 's',
    e.g. 2013 -> '2010s' and 1999 -> '1990s'.

    Args:
        YEAR: An integer year.

    Returns:
        The decade as a string such as '2000s'.
    """
    decade = (YEAR // 10) * 10
    return f'{decade}s'
# Derive each song's decade label from its YEAR.
songs['decade'] = songs['YEAR'].apply(songs_to_year)
# Box plot of POPULARITY split by whether a song is flagged as explicit.
px.box(
    data_frame=songs,
    x='EXPLICIT_CONTENT',
    y='POPULARITY',
    color='EXPLICIT_CONTENT',
    color_discrete_sequence=['cyan', 'magenta'],
    template='plotly_dark',
    title='Popularity Based On Explicit Content',
)
Songs with explicit content are less popular than songs without explicit content
# Donut chart of how many songs are / are not flagged as explicit.
explicit_counts = songs.groupby('EXPLICIT_CONTENT', as_index=False).count()
explicit_counts = explicit_counts.sort_values(by='SONG', ascending=False)
fig = px.pie(
    explicit_counts,
    names='EXPLICIT_CONTENT',
    values='SONG',
    labels={'SONG': 'Total Songs'},
    hole=.6,
    color_discrete_sequence=['green', 'crimson'],
    template='plotly_dark',
    title='Songs With Explicit Content',
)
fig.update_layout(title_x=0.5)
Only 27.6% of songs have explicit content
# Correlation heatmap of the numeric features.
# The frame also holds text columns (artist, song, genre, decade); recent
# pandas raises if corr() is asked to correlate those, so restrict the input
# to numeric and boolean columns explicitly.
numeric_songs = songs.select_dtypes(include=['number', 'bool'])
plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_songs.corr(), annot=True, square=True)
plt.show()
There is a correlation between acousticness and energy.
# Popularity per decade, with explicit and non-explicit songs side by side.
plt.figure(figsize=(14, 6))
sns.boxplot(
    x='decade',
    y='POPULARITY',
    hue='EXPLICIT_CONTENT',
    data=songs,
)
plt.title(
    'Relationship Between Popularity and Explicit Content By Decade',
    fontsize=15,
    fontweight='bold',
)
plt.xlabel('decade')
plt.ylabel('POPULARITY');
This was true for the 2000s and 2020s
# Shell command: convert UdacityLAST.ipynb to slides and serve them,
# with code inputs and prompts hidden.
! jupyter nbconvert UdacityLAST.ipynb --to slides --post serve --no-input --no-prompt